package com.galeno.sparksql

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{DataType, DataTypes, StructField, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession, types}

/**
 * @Title: SparkSql02
 * @Description: Examples of creating DataFrames from CSV files with Spark SQL.
 * @author galeno
 * @date 2021/9/4 20:10
 */
object SparkSql02 {

  /**
   * Demonstrates five ways of building a DataFrame from a CSV file:
   * default string columns, renamed columns, inferred types, an explicit
   * schema, and a header-bearing file with an explicit schema. Also writes
   * one DataFrame out as Parquet.
   */
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder()
      .appName("SparkSql02")
      .master("local")
      .getOrCreate()
    // Raise the level of the "abc" logger to cut log noise.
    Logger.getLogger("abc").setLevel(Level.WARN)

    // (1) Create a DataFrame from a CSV file; every column defaults to
    // StringType with generated names (_c0, _c1, ...).
    val df1: DataFrame = spark.read.csv("data/battel.txt")
    df1.printSchema()

    // (2) Same source, but give the columns meaningful names.
    val df2 = spark.read.csv("data/battel.txt").toDF("id", "name", "role", "power")
    df2.printSchema()

    // (3) Let Spark infer the column types (costs an extra pass over the data).
    val df3 = spark.read.option("inferSchema", "true").csv("data/battel.txt").toDF("id", "name", "role", "power")
    df3.printSchema()

    // (4) Headerless CSV with a caller-supplied schema.
    val df4 = spark.read.schema(StructType(
      Seq(
        StructField("id", DataTypes.IntegerType),
        StructField("name", DataTypes.StringType),
        StructField("role", DataTypes.StringType),
        StructField("power", DataTypes.LongType)
      )
    )).csv("data/battel.txt")
    df4.printSchema()
    // Overwrite so the demo is re-runnable: without a save mode this write
    // fails with AnalysisException once "data/parquet/" already exists.
    df4.write.mode("overwrite").parquet("data/parquet/")

    // (5) CSV file with a header row: header=true makes Spark treat the
    // first line as column names instead of data. The correct column
    // types still have to be supplied explicitly via a schema.
    val df5 = spark.read
      .option("header", "true")
      .schema(
        StructType(
          Seq(
            StructField("id", DataTypes.IntegerType),
            StructField("name", DataTypes.StringType),
            StructField("role", DataTypes.StringType),
            StructField("power", DataTypes.LongType)
          )
        )
      ).csv("data/battel2.txt")
    df5.printSchema()
    df5.show()

    // Release the local Spark resources backing this session.
    spark.stop()
  }

}
